Hungarian Secondary School Leaving Exam Analysis

Importing libraries

library(spatstat)
library(here)
library(sp)
library(maptools)
library(tmap)
library(sf)
library(tmaptools)
library(dplyr)
library(readr)
library(glue)
library(spdep)

Reading Data Secondary School Data

  • Secondary school data (source oktatas.hu)
  • Filter out the school without SELE and remove duplicates
  • Set projection and transform it to st format
SchoolData <- read_csv(here::here('data', 'school_data_Google_location_highschool_cleaned.csv'), 
         na = c("NA", "n/a", ""))

SchoolData <- SchoolData %>% dplyr::filter(!is.na(hu_midlevel))
SchoolData <- SchoolData[!duplicated(SchoolData[ , c('school_coords','hist_midlevel', 'eng_lang_midlevel', 'hu_midlevel', 'math_midlevel')]),]
projcrs <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
SchoolData <- st_as_sf(x= SchoolData,
                         coords=c('longitude','latitude'),
                         crs=projcrs)
st_crs(SchoolData) <- projcrs 

SchoolData <- SchoolData %>%
  st_transform(.,23700)

Reading Hungary Shapefile from GDAM and Top Cities

  • Top cities data source is simplemaps.com
  • Hungary shape file data source is GDAM
TopCities <- read_csv("https://simplemaps.com/static/data/country-cities/hu/hu.csv")
TopCities <- st_as_sf(x= TopCities,
                       coords=c('lng','lat'),
                       crs=projcrs)
st_crs(TopCities) <- projcrs 

### Hungary Shape
HungaryShapeMap <- st_read(here::here('data', 'gadm36_HUN_shp','gadm36_HUN_2.shp'))
## Reading layer `gadm36_HUN_2' from data source `C:\Users\botiv\OneDrive - University College London\CASA_2020\Modules\GIS and Science\Assessment\data\gadm36_HUN_shp\gadm36_HUN_2.shp' using driver `ESRI Shapefile'
## Simple feature collection with 168 features and 13 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: 16.11105 ymin: 45.74783 xmax: 22.90558 ymax: 48.58638
## geographic CRS: WGS 84
## Hungary's epsg: 23700
HungaryShapeMap <- HungaryShapeMap %>%
  st_transform(., 23700)

Merge the School data with the Shapefile

  • merge data and clean column headers with janitor
  • calcualte school density of areas
HungarySchools <- HungaryShapeMap%>%
  st_join(SchoolData)%>%
  add_count(GID_2)%>%
  janitor::clean_names()%>%
  mutate(area=st_area(.))%>%
  mutate(density=n/area)

Create the language score for all languages and create an overall language score

  • Looping through all the foreign languages. Both intermediate and advance level SELE exams are included in the calculations.
  • Generate the unified FLPS (foreign language proficiency score)
lang_cols = c('eng','hebrew','spanish','lovari','holland','slovakian','esperanto','croatian','greek','chinese','german','romanian','french','polish','portuguese','italian','japanese','serbian','russian','beas')

HungaryLanugageAll <- HungaryShapeMap
for (l in lang_cols) {
  
  var_lang_a_prodsum =rlang::sym(paste(l,"_lang_a_prodsum",sep = ""))
  var_lang_midlevel_prodsum =rlang::sym(paste(l,"_lang_midlevel_prodsum",sep = ""))
  
  var_prodsum =rlang::sym(paste(l,"_prodsum",sep = ""))
  var_stud_count =rlang::sym(paste(l,"_stud_count",sep = ""))
  
  var_lang_a <- rlang::sym(paste(l,"_lang_a",sep = ""))
  var_lang_a_percent <- rlang::sym(paste(l,"_lang_a_percent",sep = ""))
  var_lang_midlevel <- rlang::sym(paste(l,"_lang_midlevel",sep = ""))
  var_lang_midlevel_percent <- rlang::sym(paste(l,"_lang_midlevel_percent",sep = ""))
  var_score <- rlang::sym(paste(l,"_score",sep = ""))
  
  ### Creating extra columns 
  HungarySchools <- HungarySchools %>% 
    mutate(., (!!var_lang_a_prodsum) := ifelse((!!var_lang_a_percent) >= 45, ((!!var_lang_a_percent) + 50) * (!!var_lang_a) , (!!var_lang_a_percent)  * (!!var_lang_a))) %>%
    mutate(., (!!var_lang_midlevel_prodsum) := (!!var_lang_midlevel) * (!!var_lang_midlevel_percent)) %>%
    
    mutate(., (!!var_prodsum ):=  ifelse(is.na((!!var_lang_a_prodsum)), 0, (!!var_lang_a_prodsum)) +  ifelse(is.na((!!var_lang_midlevel_prodsum)), 0, (!!var_lang_midlevel_prodsum))) %>%
    mutate(., (!!var_stud_count) :=  ifelse(is.na((!!var_lang_a)), 0, (!!var_lang_a)) +  ifelse(is.na((!!var_lang_midlevel)), 0, (!!var_lang_midlevel)))
  
  ## Zeros are NA - 
  HungarySchools[[var_stud_count]] <- HungarySchools[[var_stud_count]] %>% replace(.==0, NA)
  HungarySchools[[var_prodsum]] <- HungarySchools[[var_prodsum]] %>% replace(.==0, NA)
  
  
  ### Groupby
  HungarySchoolsLanguage<- HungarySchools %>%                    
    group_by(gid_2) %>% 
    summarise(
      name_2= first(name_2),
      !!var_prodsum := sum(!!var_prodsum , na.rm = TRUE),
      !!var_stud_count := sum(!!var_stud_count, na.rm = TRUE),
    ) %>%
    mutate(., !!var_score  := (!!var_prodsum)  / (!!var_stud_count))
  
  ### Merging them into a large Dataset with all languages - covert them to data.frame - later we convert them back
  HungaryLanugageAll<- merge(HungaryLanugageAll %>% as.data.frame(),HungarySchoolsLanguage %>% as.data.frame(), by.x="GID_2", by.y="gid_2", no_dups=TRUE )
  
  #print(l)
   
}


## Prodsum cols
prodsum_cols <-  HungaryLanugageAll %>%
  select(., contains('_prodsum')) %>%
  colnames(.)
## Number of Student  columns
stud_cols <-  HungaryLanugageAll %>%
  select(., contains('_stud_count')) %>%
  colnames(.)

### Creating over all columns prodsum, all student, score 
HungaryLanugageAll$all_prodsum <- rowSums(HungaryLanugageAll[prodsum_cols], na.rm = TRUE)
HungaryLanugageAll$all_stud <- rowSums(HungaryLanugageAll[stud_cols], na.rm = TRUE)


### Convert back to SF
HungaryLanugageAll <- st_as_sf(HungaryLanugageAll)


## Final Language Score column
HungaryLanugageAll <- HungaryLanugageAll %>% 
  mutate(., all_score = all_prodsum / all_stud )

Total Number of Exam taken:

sum(HungaryLanugageAll$all_stud)
## [1] 59155

Foregin Language Score Map - For paper

### MAP OF THE LANGUAGE TOTAL SCORE
tm_shape(HungaryLanugageAll) +
  tm_polygons(col="all_score",
              style = "jenks",
              palette = c('#ffffff','#ebf0ff','#bdd7e7' , '#3182bd', '#08519c'),
              #breaks = c(0, 50, 70, 80,110),
              midpoint=NA,
              legend.hist = TRUE,
              border.alpha = 0.3,
              popup.vars=c("NAME_2", "all_score"),
              title="Language Score") +
tm_shape(slice_head(TopCities, n=10)) + 
  tm_dots(col = 'red', legend.z = 'city'  , size = 0.2) +
  #tm_symbols(col = 'grey'  , size = 'population') +
tm_shape(slice_head(TopCities, n=1)) + 
 tm_text('city',col='white',shadow = FALSE , size=0.6, ymod = 0.3,fontface="bold") + 
tm_layout(  legend.hist.bg.color = '#dedede',
            legend.title.size = 1,
            legend.outside.size=0.5,
            legend.outside = TRUE,
            legend.outside.position = 'right',
            frame=FALSE
            ) +
tm_scale_bar(position = c("right", "bottom"), text.size = 0.40)+
tm_compass(type = "8star", size = 1, position = c(0.83, 0.15))

Foregin Language Score Map - Interactive

### MAP OF THE LANGUAGE TOTAL SCORE
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(HungaryLanugageAll) +
  tm_polygons(col="all_score",
              style = "jenks",
              palette = c('#ffffff','#ebf0ff','#bdd7e7' , '#3182bd', '#08519c'),
              #breaks = c(0, 50, 70, 80,110),
              midpoint=NA,
              legend.hist = TRUE,
              border.alpha = 0.3,
              popup.vars=c("NAME_2", "all_score"),
              title="Language Score") +
tm_shape(slice_head(TopCities, n=10)) + 
  tm_dots(col = 'red', legend.z = 'city'  , size = 0.2) +
  #tm_symbols(col = 'grey'  , size = 'population') +
tm_shape(slice_head(TopCities, n=1)) + 
 tm_text('city',col='white',shadow = FALSE , size=0.6, ymod = 0.3,fontface="bold") + 
tm_layout(  legend.hist.bg.color = '#dedede',
            legend.title.size = 1,
            legend.outside.size=0.5,
            legend.outside = TRUE,
            legend.outside.position = 'right',
            frame=FALSE
            )

Location Quotient calucaltion - for paper

  • to observe how areas differ from the national average
## Calculate the SELE language exams average by school
HungaryAverage = sum(HungaryLanugageAll$all_prodsum) / sum(HungaryLanugageAll$all_stud)

## Create the LQ column
HungaryLanugageAll$all_score_LQ<-ifelse(is.na(HungaryLanugageAll$all_score), NA, HungaryLanugageAll$all_score/HungaryAverage)



#https://colorbrewer2.org/#type=sequential&scheme=Purples&n=5
 
tmap_mode("plot")
## tmap mode set to plotting
tm_shape(HungaryLanugageAll) +
  tm_polygons(col="all_score_LQ",
              style = "jenks",
              palette = 'RdBu',
              #breaks = c(0, 50, 70, 80,110),
              midpoint=NA,
              legend.hist = TRUE,
              border.alpha = 0.3,
              popup.vars=c("NAME_2", "all_score"),
              title="Language Score Location Quotient") +
  tm_shape(slice_head(TopCities, n=10)) + 
  tm_dots(col = 'red', legend.z = 'city'  , size = 0.2) +
  #tm_symbols(col = 'grey'  , size = 'population') +
  tm_shape(slice_head(TopCities, n=1)) + 
  tm_text('city',col='white',shadow = FALSE , size=0.6, ymod = 0.3,fontface="bold") + 
  tm_layout(  legend.hist.bg.color = '#dedede',
              legend.title.size = 1,
              legend.outside.size=0.5,
              legend.outside = TRUE,
              legend.outside.position = 'right',
              frame=FALSE
  ) +
  tm_scale_bar(position = c("right", "bottom"), text.size = 0.40)+
  tm_compass(type = "8star", size = 1, position = c(0.83, 0.15))

### Location Quotient calucaltion - Interactive

tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(HungaryLanugageAll) +
  tm_polygons(col="all_score_LQ",
              style = "jenks",
              palette = 'RdBu',
              #breaks = c(0, 50, 70, 80,110),
              midpoint=NA,
              legend.hist = TRUE,
              border.alpha = 0.3,
              popup.vars=c("NAME_2", "all_score"),
              title="Language Score Location Quotient") +
  tm_shape(slice_head(TopCities, n=10)) + 
  tm_dots(col = 'red', legend.z = 'city'  , size = 0.2) +
  #tm_symbols(col = 'grey'  , size = 'population') +
  tm_shape(slice_head(TopCities, n=1)) + 
  tm_text('city',col='white',shadow = FALSE , size=0.6, ymod = 0.3,fontface="bold") + 
  tm_layout(  legend.hist.bg.color = '#dedede',
              legend.title.size = 1,
              legend.outside.size=0.5,
              legend.outside = TRUE,
              legend.outside.position = 'right',
              frame=FALSE
  )

Global Moran’s I

  • to observe the possibility of spatial autocorrelation
## Drop NaNs
HungaryLanugageAll <- HungaryLanugageAll %>% dplyr::filter(!is.na(all_score))

coordsW <- HungaryLanugageAll%>%
  st_centroid()%>%
  st_geometry()
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over
## geometries of x
plot(coordsW,axes=TRUE)

#create a neighbours list
HungaryLanugageAll_nb <- HungaryLanugageAll %>%
  spdep::poly2nb(., queen=T)


### Plot
tmap_mode("plot")
## tmap mode set to plotting
plot(HungaryLanugageAll_nb, st_geometry(coordsW), col="red")
#add a map underneath
plot(HungaryLanugageAll$geometry, add=T)

#create a spatial weights object from these weights
HungaryLanugageAll.lw <- HungaryLanugageAll_nb %>%
  spdep::nb2listw(., style="C")

### Moran's I

##all score the variable
I_HungaryLanugageAll_Global_Density <- HungaryLanugageAll %>%
  pull(all_score) %>%
  as.vector()%>%
  spdep::moran.test(., HungaryLanugageAll.lw)

HungaryLanugageAll.lw
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 158 
## Number of nonzero links: 820 
## Percentage nonzero weights: 3.28473 
## Average number of links: 5.189873 
## 
## Weights style: C 
## Weights constants summary:
##     n    nn  S0      S1       S2
## C 158 24964 158 60.8878 681.9434
I_HungaryLanugageAll_Global_Density
## 
##  Moran I test under randomisation
## 
## data:  .  
## weights: HungaryLanugageAll.lw    
## 
## Moran I statistic standard deviate = 4.3938, p-value = 5.569e-06
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.206107125      -0.006369427       0.002338512

Getis Ord G*

  • to highlight hot and cold spots in the FLPS of areas
Gi_FL_score_Local_Density <-  HungaryLanugageAll %>%
  pull(all_score) %>%
  as.vector()%>%
  localG(., HungaryLanugageAll.lw)


HungaryLanugageAll <- HungaryLanugageAll %>%
  mutate(density_G = as.numeric(Gi_FL_score_Local_Density))


breaks1<-c(-1000,-2.58,-1.96,-1.65,1.65,1.96,2.58,1000)
tmap_mode("plot")
## tmap mode set to plotting
tm_shape(HungaryLanugageAll) +
  tm_polygons("density_G",
              palette="-RdBu",
              style="fixed",
              breaks=breaks1,
              midpoint=NA,
              title="Gi*, FL capabilities Hot-Cold Spot in Hungary")+
tm_shape(slice_head(TopCities, n=10)) + 
  tm_dots(col = 'red', legend.z = 'city'  , size = 0.2) +
  #tm_symbols(col = 'grey'  , size = 'population') +
  tm_shape(slice_head(TopCities, n=1)) + 
  tm_text('city',col='white',shadow = FALSE , size=0.6, ymod = 0.3,fontface="bold") + 
  tm_layout(  legend.hist.bg.color = '#dedede',
              legend.title.size = 1,
              legend.outside.size=0.5,
              legend.outside = TRUE,
              legend.outside.position = 'right',
              frame=FALSE
  ) +
  tm_scale_bar(position = c("right", "bottom"), text.size = 0.40)+
  tm_compass(type = "8star", size = 1, position = c(0.83, 0.15))